import re
import sys
import warnings
import pandas as pd
from pandasql import sqldf
# Ignore all warnings
warnings.filterwarnings("ignore")
import os
import pyodbc
import pandas as pd
import pandera as pa
import csv
#from bs4 import BeautifulSoup
#from pandera import DataFrameSchema,Column, Check, Index, MultiIndex
#import sqlalchemy;
from os import environ;
pd.set_option('display.max_columns', None)
dirpath = "./"
# Check if directory exists
if not os.path.exists(dirpath):
raise FileNotFoundError(f"The directory {dirpath} does not exist.")
else:
print(f"The directory {dirpath} exists.")
# List all files in the directory
files_in_dir = os.listdir(dirpath)
print("Files in directory:", files_in_dir)
all_filename = 'Counties_Percapita_Formatted.csv'
singlefamily_filename = 'Counties_Percapita_FormattedZip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv.csv'
ThreeBedRoom_filename = 'Counties_Percapita_FormattedZip_zhvi_uc_ThreeBedRoom_tier_0.33_0.67_sm_sa_month.csv.csv'
threebed_filename = 'Counties_Percapita_FormattedZip_zhvi_bdrmcnt_3_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv'
twobed_filename = 'Counties_Percapita_FormattedZip_zhvi_bdrmcnt_2_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv'
condo_filename = 'Counties_Percapita_FormattedZip_zhvi_uc_condo_tier_0.33_0.67_sm_sa_month.csv.csv'
all_filepath = os.path.join(dirpath, all_filename)
allhomes_df = pd.read_csv(all_filepath, header=0) # Read without headers
sf_filepath = os.path.join(dirpath, singlefamily_filename)
sf_df = pd.read_csv(sf_filepath, header=0) # Read without headers
condo_filepath = os.path.join(dirpath, condo_filename)
condo_df = pd.read_csv(condo_filepath, header=0) # Read without headers
threebed_filepath = os.path.join(dirpath, threebed_filename)
threebed_df = pd.read_csv(threebed_filepath, header=0) # Read without headers
twobed_filepath = os.path.join(dirpath, twobed_filename)
twobed_df = pd.read_csv(twobed_filepath, header=0) # Read without headers
The directory C:\Users\req92163\OneDrive - Mutual of Omaha\DSCI8950\DS_2025_TeamE\data_files exists. Files in directory: ['bkp_files', 'cfips_location.csv', 'Counties_Percapita_Formatted.csv', 'Counties_Percapita_FormattedZipcode_zhvi_sfcondo_tier.csv.csv', 'Counties_Percapita_FormattedZip_zhvi_bdrmcnt_2_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv', 'Counties_Percapita_FormattedZip_zhvi_bdrmcnt_3_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv', 'Counties_Percapita_FormattedZip_zhvi_uc_condo_tier_0.33_0.67_sm_sa_month.csv.csv', 'Counties_Percapita_FormattedZip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Fmatted_IncomeZipcode_zhvi_sfcondo_tier.csv.csv', 'percapital_Fmatted_IncomeZip_zhvi_bdrmcnt_2_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Fmatted_IncomeZip_zhvi_bdrmcnt_3_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Fmatted_IncomeZip_zhvi_uc_condo_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Fmatted_IncomeZip_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_month.csv.csv', 'percapital_Formatted_Income.csv', 'Percapita_Income.csv', 'test.txt', 'zillow_combined_zhvi.csv']
twobed_df.rename(columns={
'HouseValue': 'AnnualValue_TwoBedRoom',
'prev_HouseValue': 'Previous_AnnualValue_TwoBedRoom',
'annual_increase': 'Annual_Increase_TwoBedRoom',
'BasePrice': '5Year_BasePrice_TwoBedRoom',
'increase_base_pct': '5Year_Percent_Increase_TwoBedRoom',
'Rank_Annual': 'Annual_Rank_TwoBedRoom',
'annual_value_category': 'Annual_Value_Category_TwoBedRoom'
}, inplace=True)
#twobed_df.columns
threebed_df.rename(columns={
'HouseValue': 'AnnualValue_ThreeBedRoom',
'prev_HouseValue': 'Previous_AnnualValue_ThreeBedRoom',
'annual_increase': 'Annual_Increase_ThreeBedRoom',
'BasePrice': '5Year_BasePrice_ThreeBedRoom',
'increase_base_pct': '5Year_Percent_Increase_ThreeBedRoom',
'Rank_Annual': 'Annual_Rank_ThreeBedRoom',
'annual_value_category': 'Annual_Value_Category_ThreeBedRoom'
}, inplace=True)
#threebed_df.columns
allhomes_df.rename(columns={
'HouseValue': 'AnnualValue_AllHomes',
'prev_HouseValue': 'Previous_AnnualValue_AllHomes',
'annual_increase': 'Annual_Increase_AllHomes',
'BasePrice': '5Year_BasePrice_AllHomes',
'increase_base_pct': '5Year_Percent_Increase_AllHomes',
'Rank_Annual': 'Annual_Rank_AllHomes',
'annual_value_category': 'Annual_Value_Category_AllHomes'
}, inplace=True)
#allhomes_df.columns
sf_df.rename(columns={
'HouseValue': 'AnnualValue_SingleFamily',
'prev_HouseValue': 'Previous_AnnualValue_SingleFamily',
'annual_increase': 'Annual_Increase_SingleFamily',
'BasePrice': '5Year_BasePrice_SingleFamily',
'increase_base_pct': '5Year_Percent_Increase_SingleFamily',
'Rank_Annual': 'Annual_Rank_SingleFamily',
'annual_value_category': 'Annual_Value_Category_SingleFamily'
}, inplace=True)
#sf_df.columns
condo_df.rename(columns={
'HouseValue': 'AnnualValue_Condo',
'prev_HouseValue': 'Previous_AnnualValue_Condo',
'annual_increase': 'Annual_Increase_Condo',
'BasePrice': '5Year_BasePrice_Condo',
'increase_base_pct': '5Year_Percent_Increase_Condo',
'Rank_Annual': 'Annual_Rank_Condo',
'annual_value_category': 'Annual_Value_Category_Condo'
}, inplace=True)
condo_df.columns
Index(['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng',
'Year', '5Year_BasePrice_Condo', 'AnnualValue_Condo',
'Previous_AnnualValue_Condo', '5Year_Percent_Increase_Condo',
'Annual_Increase_Condo', 'Annual_Rank_Condo',
'Annual_Value_Category_Condo', 'Personal_Income_Raise',
'Population_Raise', 'Percapita_Income_Raise'],
dtype='object')
# Initialize merged_df with allhomes_df
merged_df = allhomes_df[['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng', 'Year',
'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise',
'5Year_BasePrice_AllHomes', 'AnnualValue_AllHomes', 'Previous_AnnualValue_AllHomes',
'5Year_Percent_Increase_AllHomes', 'Annual_Increase_AllHomes', 'Annual_Rank_AllHomes',
'Annual_Value_Category_AllHomes']].copy()
# Select only 'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise' columns from allhomes_df
merged_df = merged_df[['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng', 'Year',
'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise',
'5Year_BasePrice_AllHomes', 'AnnualValue_AllHomes', 'Previous_AnnualValue_AllHomes',
'5Year_Percent_Increase_AllHomes', 'Annual_Increase_AllHomes', 'Annual_Rank_AllHomes',
'Annual_Value_Category_AllHomes']]
# Merge all dataframes on the specified keys
#keys = ['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng', 'Year', 'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise']
keys = ['CountyName', 'StateName', 'Year']
merged_df = merged_df.merge(sf_df, on=keys, how='outer', suffixes=('', '_SingleFamily'))
merged_df = merged_df.merge(condo_df, on=keys, how='inner', suffixes=('', '_Condo'))
merged_df = merged_df.merge(threebed_df, on=keys, how='outer', suffixes=('', '_ThreeBedRoom'))
merged_df = merged_df.merge(twobed_df, on=keys, how='outer', suffixes=('', '_TwoBedRoom'))
merged_df.rename(columns={
'CountyName': 'County',
'StateName': 'State',
'FIPS': 'FIPS_Code',
'City': 'City_Name',
'Metro': 'Metropolitan_Area',
'lat': 'Latitude',
'lng': 'Longitude',
'Year': 'Year_Recorded',
'Personal_Income_Raise': 'Personal_Income_Growth',
'Population_Raise': 'Population_Growth',
'Percapita_Income_Raise': 'Per_Capita_Income_Growth'
},inplace=True)
# Display the first few rows of the merged dataframe
# Initialize merged_df with allhomes_df
merged_df = allhomes_df[['CountyName', 'StateName', 'FIPS', 'City', 'Metro', 'lat', 'lng', 'Year',
'Personal_Income_Raise', 'Population_Raise', 'Percapita_Income_Raise',
'5Year_BasePrice_AllHomes', '5Year_Percent_Increase_AllHomes', 'AnnualValue_AllHomes', 'Previous_AnnualValue_AllHomes',
'Annual_Increase_AllHomes', 'Annual_Rank_AllHomes',
'Annual_Value_Category_AllHomes']].copy()
# Merge all dataframes on the specified keys
keys = ['CountyName', 'StateName', 'Year']
merged_df = merged_df.merge(sf_df[['CountyName', 'StateName', 'Year',
'5Year_BasePrice_SingleFamily','5Year_Percent_Increase_SingleFamily', 'AnnualValue_SingleFamily',
'Previous_AnnualValue_SingleFamily',
'Annual_Increase_SingleFamily', 'Annual_Rank_SingleFamily',
'Annual_Value_Category_SingleFamily']], on=keys, how='outer')
merged_df = merged_df.merge(condo_df[['CountyName', 'StateName', 'Year',
'5Year_BasePrice_Condo', '5Year_Percent_Increase_Condo', 'AnnualValue_Condo',
'Previous_AnnualValue_Condo',
'Annual_Increase_Condo', 'Annual_Rank_Condo',
'Annual_Value_Category_Condo']], on=keys, how='outer')
merged_df = merged_df.merge(threebed_df[['CountyName', 'StateName', 'Year',
'5Year_BasePrice_ThreeBedRoom','5Year_Percent_Increase_ThreeBedRoom', 'AnnualValue_ThreeBedRoom',
'Previous_AnnualValue_ThreeBedRoom',
'Annual_Increase_ThreeBedRoom', 'Annual_Rank_ThreeBedRoom',
'Annual_Value_Category_ThreeBedRoom']], on=keys, how='outer')
merged_df = merged_df.merge(twobed_df[['CountyName', 'StateName', 'Year',
'5Year_BasePrice_TwoBedRoom','5Year_Percent_Increase_TwoBedRoom', 'AnnualValue_TwoBedRoom',
'Previous_AnnualValue_TwoBedRoom',
'Annual_Increase_TwoBedRoom', 'Annual_Rank_TwoBedRoom',
'Annual_Value_Category_TwoBedRoom']], on=keys, how='outer')
merged_df.rename(columns={
'CountyName': 'County',
'StateName': 'State',
'FIPS': 'FIPS_Code',
'City': 'City_Name',
'Metro': 'Metropolitan_Area',
'lat': 'Latitude',
'lng': 'Longitude',
'Year': 'Year_Recorded',
'Personal_Income_Raise': 'Personal_Income_Growth',
'Population_Raise': 'Population_Growth',
'Percapita_Income_Raise': 'Per_Capita_Income_Growth'
}, inplace=True)
# Reorder columns
#merged_df = merged_df[column_order]
# Display the first few rows of the merged dataframe
merged_df.head(15)
| County | State | FIPS_Code | City_Name | Metropolitan_Area | Latitude | Longitude | Year_Recorded | Personal_Income_Growth | Population_Growth | Per_Capita_Income_Growth | 5Year_BasePrice_AllHomes | 5Year_Percent_Increase_AllHomes | AnnualValue_AllHomes | Previous_AnnualValue_AllHomes | Annual_Increase_AllHomes | Annual_Rank_AllHomes | Annual_Value_Category_AllHomes | 5Year_BasePrice_SingleFamily | 5Year_Percent_Increase_SingleFamily | AnnualValue_SingleFamily | Previous_AnnualValue_SingleFamily | Annual_Increase_SingleFamily | Annual_Rank_SingleFamily | Annual_Value_Category_SingleFamily | 5Year_BasePrice_Condo | 5Year_Percent_Increase_Condo | AnnualValue_Condo | Previous_AnnualValue_Condo | Annual_Increase_Condo | Annual_Rank_Condo | Annual_Value_Category_Condo | 5Year_BasePrice_ThreeBedRoom | 5Year_Percent_Increase_ThreeBedRoom | AnnualValue_ThreeBedRoom | Previous_AnnualValue_ThreeBedRoom | Annual_Increase_ThreeBedRoom | Annual_Rank_ThreeBedRoom | Annual_Value_Category_ThreeBedRoom | 5Year_BasePrice_TwoBedRoom | 5Year_Percent_Increase_TwoBedRoom | AnnualValue_TwoBedRoom | Previous_AnnualValue_TwoBedRoom | Annual_Increase_TwoBedRoom | Annual_Rank_TwoBedRoom | Annual_Value_Category_TwoBedRoom | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ABBEVILLE | SC | NaN | Donalds | Greenwood, SC | 34.222695 | -82.458727 | 2020 | NaN | NaN | NaN | 117403.000000 | 18.0 | 138137.000000 | 117403.000000 | 18.0 | 103.0 | Above 15 Percent | 115696.000000 | 18.0 | 136407.000000 | 115696.000000 | 18.0 | 102.0 | Above 15 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 116718.000000 | 17.0 | 136521.000000 | 116718.000000 | 17.0 | 85.5 | Above 15 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ABBEVILLE | SC | NaN | Donalds | Greenwood, SC | 34.222695 | -82.458727 | 2021 | NaN | NaN | NaN | 117403.000000 | 39.0 | 162819.000000 | 138137.000000 | 18.0 | 538.5 | Above 15 Percent | 115696.000000 | 39.0 | 160577.000000 | 136407.000000 | 18.0 | 540.5 | Above 15 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 116718.000000 | 35.0 | 157712.000000 | 136521.000000 | 16.0 | 851.5 | Above 15 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ABBEVILLE | SC | 45001.0 | Donalds | Greenwood, SC | 34.222695 | -82.458727 | 2022 | 7.8 | 0.0 | 7.9 | 117403.000000 | 56.0 | 183338.000000 | 162819.000000 | 13.0 | 285.0 | Above 12 Percent | 115696.000000 | 57.0 | 181376.000000 | 160577.000000 | 13.0 | 287.5 | Above 12 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 116718.000000 | 60.0 | 186346.000000 | 157712.000000 | 18.0 | 72.0 | Above 15 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ABBEVILLE | SC | 45001.0 | Donalds | Greenwood, SC | 34.222695 | -82.458727 | 2023 | 2.2 | 0.3 | 1.9 | 117403.000000 | 54.0 | 181153.000000 | 183338.000000 | -1.0 | 1828.0 | 2 Percent Decrease | 115696.000000 | 55.0 | 179578.000000 | 181376.000000 | -1.0 | 1828.5 | 2 Percent Decrease | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 116718.000000 | 67.0 | 195261.000000 | 186346.000000 | 5.0 | 701.0 | Above 5 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ABBEVILLE | SC | 45001.0 | Donalds | Greenwood, SC | 34.222695 | -82.458727 | 2024 | 5.0 | 0.4 | 4.6 | 117403.000000 | 58.0 | 184911.000000 | 181153.000000 | 2.0 | 1709.0 | Above 1 Percent | 115696.000000 | 58.0 | 183180.000000 | 179578.000000 | 2.0 | 1718.0 | Above 1 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 116718.000000 | 74.0 | 203177.000000 | 195261.000000 | 4.0 | 1174.0 | Above 3 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 5 | ADA | ID | NaN | Star | Boise City, ID | 43.450841 | -116.241035 | 2020 | NaN | NaN | NaN | 355784.733333 | 19.0 | 424153.000000 | 355784.733333 | 19.0 | 65.5 | Above 15 Percent | 357113.200000 | 19.0 | 426129.933333 | 357113.200000 | 19.0 | 68.0 | Above 15 Percent | 242491.25 | 15.0 | 279064.500 | 242491.250 | 15.0 | 22.5 | No Increase | 329725.866667 | 19.0 | 393318.666667 | 329725.866667 | 19.0 | 40.5 | Above 15 Percent | 269304.266667 | 20.0 | 324277.466667 | 269304.266667 | 20.0 | 77.0 | Above 15 Percent |
| 6 | ADA | ID | NaN | Star | Boise City, ID | 43.450841 | -116.241035 | 2021 | NaN | NaN | NaN | 355784.733333 | 58.0 | 560965.533333 | 424153.000000 | 32.0 | 33.0 | Above 15 Percent | 357113.200000 | 58.0 | 563613.466667 | 426129.933333 | 32.0 | 36.0 | Above 15 Percent | 242491.25 | 57.0 | 381267.625 | 279064.500 | 37.0 | 26.0 | Above 15 Percent | 329725.866667 | 57.0 | 517221.000000 | 393318.666667 | 32.0 | 40.0 | Above 15 Percent | 269304.266667 | 59.0 | 428172.533333 | 324277.466667 | 32.0 | 54.0 | Above 15 Percent |
| 7 | ADA | ID | 16001.0 | Star | Boise City, ID | 43.450841 | -116.241035 | 2022 | 15.6 | 3.0 | 12.3 | 355784.733333 | 53.0 | 545204.933333 | 560965.533333 | -3.0 | 2123.0 | 4 Percent Decrease | 357113.200000 | 53.0 | 547932.533333 | 563613.466667 | -3.0 | 2122.0 | 4 Percent Decrease | 242491.25 | 54.0 | 372473.125 | 381267.625 | -2.0 | 709.5 | 2 Percent Decrease | 329725.866667 | 53.0 | 503376.200000 | 517221.000000 | -3.0 | 1954.0 | 4 Percent Decrease | 269304.266667 | 55.0 | 417129.133333 | 428172.533333 | -3.0 | 1682.5 | 4 Percent Decrease |
| 8 | ADA | ID | 16001.0 | Star | Boise City, ID | 43.450841 | -116.241035 | 2023 | 7.1 | 1.2 | 5.9 | 355784.733333 | 50.0 | 532802.866667 | 545204.933333 | -2.0 | 1925.0 | 2 Percent Decrease | 357113.200000 | 50.0 | 535748.600000 | 547932.533333 | -2.0 | 1921.5 | 2 Percent Decrease | 242491.25 | 42.0 | 343148.875 | 372473.125 | -8.0 | 707.5 | 8 Percent Decrease | 329725.866667 | 50.0 | 494369.933333 | 503376.200000 | -2.0 | 1857.0 | 2 Percent Decrease | 269304.266667 | 50.0 | 402681.333333 | 417129.133333 | -3.0 | 1607.0 | 4 Percent Decrease |
| 9 | ADA | ID | 16001.0 | Star | Boise City, ID | 43.450841 | -116.241035 | 2024 | 5.9 | 1.1 | 4.8 | 355784.733333 | 54.0 | 549132.133333 | 532802.866667 | 3.0 | 1466.0 | Above 3 Percent | 357113.200000 | 55.0 | 552415.266667 | 535748.600000 | 3.0 | 1472.0 | Above 3 Percent | 242491.25 | 39.0 | 336176.500 | 343148.875 | -2.0 | 599.0 | 2 Percent Decrease | 329725.866667 | 54.0 | 509161.866667 | 494369.933333 | 3.0 | 1455.0 | Above 3 Percent | 269304.266667 | 52.0 | 409381.200000 | 402681.333333 | 2.0 | 1295.5 | Above 1 Percent |
| 10 | ADAIR | IA | NaN | Stuart | Des Moines-West Des Moines, IA | 41.330756 | -94.471059 | 2020 | NaN | NaN | NaN | 146856.333333 | 10.0 | 160897.666667 | 146856.333333 | 10.0 | 1332.5 | Above 10 Percent | 143621.666667 | 10.0 | 157272.000000 | 143621.666667 | 10.0 | 1337.0 | Above 10 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 165108.000000 | 9.0 | 180129.500000 | 165108.000000 | 9.0 | 1432.5 | Above 8 Percent | 139233.000000 | 8.0 | 150896.000000 | 139233.000000 | 8.0 | 1352.0 | Above 8 Percent |
| 11 | ADAIR | IA | NaN | Stuart | Des Moines-West Des Moines, IA | 41.330756 | -94.471059 | 2021 | NaN | NaN | NaN | 146856.333333 | 23.0 | 180827.000000 | 160897.666667 | 12.0 | 1321.5 | Above 12 Percent | 143621.666667 | 23.0 | 177125.333333 | 157272.000000 | 13.0 | 1170.5 | Above 12 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 165108.000000 | 26.0 | 207758.000000 | 180129.500000 | 15.0 | 981.5 | No Increase | 139233.000000 | 26.0 | 175802.000000 | 150896.000000 | 17.0 | 675.5 | Above 15 Percent |
| 12 | ADAIR | IA | 19001.0 | Stuart | Des Moines-West Des Moines, IA | 41.330756 | -94.471059 | 2022 | 21.7 | 0.4 | 21.2 | 146856.333333 | 40.0 | 205169.333333 | 180827.000000 | 13.0 | 285.0 | Above 12 Percent | 143621.666667 | 40.0 | 200701.666667 | 177125.333333 | 13.0 | 287.5 | Above 12 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 165108.000000 | 40.0 | 231533.500000 | 207758.000000 | 11.0 | 832.5 | Above 10 Percent | 139233.000000 | 42.0 | 198035.000000 | 175802.000000 | 13.0 | 479.0 | Above 12 Percent |
| 13 | ADAIR | IA | 19001.0 | Stuart | Des Moines-West Des Moines, IA | 41.330756 | -94.471059 | 2023 | 4.1 | -0.9 | 5.0 | 146856.333333 | 38.0 | 202807.666667 | 205169.333333 | -1.0 | 1828.0 | 2 Percent Decrease | 143621.666667 | 38.0 | 198284.000000 | 200701.666667 | -1.0 | 1828.5 | 2 Percent Decrease | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 165108.000000 | 39.0 | 229555.000000 | 231533.500000 | -1.0 | 1790.0 | 2 Percent Decrease | 139233.000000 | 38.0 | 192032.000000 | 198035.000000 | -3.0 | 1607.0 | 4 Percent Decrease |
| 14 | ADAIR | IA | 19001.0 | Stuart | Des Moines-West Des Moines, IA | 41.330756 | -94.471059 | 2024 | 0.1 | -0.9 | 1.1 | 146856.333333 | 47.0 | 216255.000000 | 202807.666667 | 7.0 | 338.5 | Above 5 Percent | 143621.666667 | 47.0 | 211577.666667 | 198284.000000 | 7.0 | 344.0 | Above 5 Percent | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 165108.000000 | 49.0 | 245273.500000 | 229555.000000 | 7.0 | 305.5 | Above 5 Percent | 139233.000000 | 50.0 | 208822.000000 | 192032.000000 | 9.0 | 100.0 | Above 8 Percent |
# Sort the dataframe by 'State', 'County', and 'Year'
sorted_df = merged_df.sort_values(by=['State', 'County', 'Year_Recorded'])
# Write the sorted dataframe to a CSV file
sorted_df.to_csv(os.path.join(dirpath, 'zillow_combined_zhvi.csv'), index=False)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Filter Nebraska data and relevant years
nebraska_df = merged_df[(merged_df['State'] == 'NE') & (merged_df['Year_Recorded'].between(2019, 2024))]
# Melt the dataframe to long format
nebraska_melted_df = nebraska_df.melt(id_vars=['Year_Recorded', 'County'],
value_vars=['AnnualValue_AllHomes', 'AnnualValue_SingleFamily',
'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 'AnnualValue_ThreeBedRoom'],
var_name='HomeType', value_name='AnnualValue')
# Convert Year to numeric (just in case it's not)
nebraska_melted_df['Year_Recorded'] = nebraska_melted_df['Year_Recorded'].astype(int)
# Set Seaborn style
sns.set(style="whitegrid")
# Create FacetGrid for county-wise visualization
g = sns.FacetGrid(nebraska_melted_df, col="County", col_wrap=4, height=4, sharey=False)
g.map_dataframe(sns.lineplot, x="Year_Recorded", y="AnnualValue", hue="HomeType", marker="o")
# Adjust legends and labels
g.add_legend()
g.set_axis_labels("Year", "Annual Value ($)")
g.set_titles(col_template="{col_name} County")
# Rotate x-axis labels for better readability
plt.xticks(rotation=45)
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Filter Nebraska data and relevant years
california_df = merged_df[(merged_df['State'] == 'CA') & (merged_df['Year_Recorded'].between(2019, 2024))]
# Melt the dataframe to long format
california_melted_df = california_df.melt(id_vars=['Year_Recorded', 'County'],
value_vars=['AnnualValue_AllHomes', 'AnnualValue_SingleFamily',
'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 'AnnualValue_ThreeBedRoom'],
var_name='HomeType', value_name='AnnualValue')
# Convert Year to numeric (just in case it's not)
california_melted_df['Year_Recorded'] = california_melted_df['Year_Recorded'].astype(int)
# Set Seaborn style
sns.set(style="whitegrid")
# Create FacetGrid for county-wise visualization
g = sns.FacetGrid(california_melted_df, col="County", col_wrap=4, height=4, sharey=False)
g.map_dataframe(sns.lineplot, x="Year_Recorded", y="AnnualValue", hue="HomeType", marker="o")
# Adjust legends and labels
g.add_legend()
g.set_axis_labels("Year", "Annual Value ($)")
g.set_titles(col_template="{col_name} County")
plt.title("Annual Value Increases across California Counties")
# Rotate x-axis labels for better readability
plt.xticks(rotation=45)
plt.show()
plt.figure(figsize=(12, 6))
sns.kdeplot(data=nebraska_melted_df, x="AnnualValue", hue="County", fill=True, alpha=0.3)
plt.title("Kernel Density Estimate of Annual Home Values (Nebraska Counties)")
plt.xlabel("Annual Home Value ($)")
plt.ylabel("Density")
plt.legend(title="County")
plt.show()
This Stacked area Plot for Nebraska shows consistent upward trend in annual home values across all categories. Dominance of All Homes and Single Family Homes as they form base of stack and show most significant growth. Also shows significant contribution from Two and Three Bed rooms.Condo category shows least amount of growth compared to other categories.Slope of stacked area suggesting steady and sustatined growth rate in home values.
# Group data by Year and sum the values for each home type
area_data = nebraska_df.groupby('Year_Recorded')[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily',
'AnnualValue_Condo', 'AnnualValue_TwoBedRoom',
'AnnualValue_ThreeBedRoom']].sum()
# Plot the stacked area plot
area_data.plot.area(stacked=True, figsize=(12, 6), colormap="coolwarm")
plt.title("Stacked Area Plot: Annual Home Value for Nebraska Increases (2019-2024)")
plt.xlabel("Year")
plt.ylabel("Total Annual Value ($)")
plt.xticks(rotation=45)
plt.show()
# Group data by Year and calculate the mean for each home type
area_data = nebraska_df.groupby('Year_Recorded')[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily',
'AnnualValue_Condo', 'AnnualValue_TwoBedRoom',
'AnnualValue_ThreeBedRoom']].mean()
# Reset the index to prepare for melting
area_data.reset_index(inplace=True)
# Melt the dataframe to long format for easier plotting
melted_area_data = area_data.melt(id_vars='Year_Recorded',
var_name='HomeType',
value_name='AnnualValue')
melted_area_data['HomeType'] = melted_area_data['HomeType'].str.replace('AnnualValue_', '')
# Create a grouped bar plot with distinct colors
plt.figure(figsize=(12, 6))
sns.barplot(data=melted_area_data, x='Year_Recorded', y='AnnualValue', hue='HomeType', palette='tab10')
# Set custom y-axis ticks and labels
plt.yticks(ticks=[50000, 100000, 150000, 200000], labels=['1', '2', '3', '4'])
# Add titles and labels
plt.title("Mean Annual Home Value for Nebraska (2019-2024)")
plt.xlabel("Year")
plt.ylabel("Home Value (50k) interval")
# Adjust legend to the right side and lower
plt.legend(title="Home Type", bbox_to_anchor=(1.2, 0.5), loc='center right')
plt.xticks(rotation=45)
# Show the plot
plt.show()
area_data
| Year_Recorded | AnnualValue_AllHomes | AnnualValue_SingleFamily | AnnualValue_Condo | AnnualValue_TwoBedRoom | AnnualValue_ThreeBedRoom | |
|---|---|---|---|---|---|---|
| 0 | 2020 | 177474.575154 | 172517.263046 | 179463.958333 | 126276.855898 | 179030.370435 |
| 1 | 2021 | 193967.152737 | 188472.184144 | 201416.138889 | 140893.739311 | 198395.664257 |
| 2 | 2022 | 216364.318693 | 210144.002576 | 215266.583333 | 155791.905359 | 220881.031775 |
| 3 | 2023 | 222824.066206 | 216459.869567 | 226748.472222 | 162000.456708 | 228420.175757 |
| 4 | 2024 | 235226.804843 | 228646.313833 | 227612.680556 | 170177.036401 | 241818.987203 |
texas_df = merged_df[(merged_df['State'] == 'TX') & (merged_df['Year_Recorded'].between(2019, 2024))]
# Group data by Year and calculate the mean for each home type
area_data = texas_df.groupby('Year_Recorded')[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily',
'AnnualValue_Condo', 'AnnualValue_TwoBedRoom',
'AnnualValue_ThreeBedRoom']].mean()
# Reset the index to prepare for melting
area_data.reset_index(inplace=True)
# Melt the dataframe to long format for easier plotting
melted_area_data = area_data.melt(id_vars='Year_Recorded',
var_name='HomeType',
value_name='AnnualValue')
melted_area_data['HomeType'] = melted_area_data['HomeType'].str.replace('AnnualValue_', '')
# Create a grouped bar plot with distinct colors
plt.figure(figsize=(12, 6))
sns.barplot(data=melted_area_data, x='Year_Recorded', y='AnnualValue', hue='HomeType', palette='tab10')
# Add titles and labels
plt.title("Mean Annual Home Value for Texas (2019-2024)")
plt.xlabel("Year")
plt.ylabel("Home Value (50k) interval")
# Adjust legend to the right side and lower
plt.legend(title="Home Type", bbox_to_anchor=(1.2, 0.5), loc='center right')
plt.xticks(rotation=45)
# Show the plot
plt.show()
area_data
| Year_Recorded | AnnualValue_AllHomes | AnnualValue_SingleFamily | AnnualValue_Condo | AnnualValue_TwoBedRoom | AnnualValue_ThreeBedRoom | |
|---|---|---|---|---|---|---|
| 0 | 2020 | 202722.128510 | 202145.706569 | 174742.141054 | 145588.813376 | 201051.667907 |
| 1 | 2021 | 237267.928786 | 236574.945322 | 212633.963695 | 177102.407229 | 241266.520110 |
| 2 | 2022 | 249959.277645 | 249202.798058 | 235949.709449 | 196493.159187 | 266939.151848 |
| 3 | 2023 | 244409.487229 | 243612.356229 | 234366.815257 | 194003.554493 | 265470.577329 |
| 4 | 2024 | 246921.150411 | 246151.022535 | 227616.099913 | 192820.408494 | 268915.913155 |
texas_df.groupby('Year_Recorded')[['AnnualValue_Condo', 'AnnualValue_SingleFamily',
'AnnualValue_TwoBedRoom', 'AnnualValue_ThreeBedRoom']].mean()
| AnnualValue_Condo | AnnualValue_SingleFamily | AnnualValue_TwoBedRoom | AnnualValue_ThreeBedRoom | |
|---|---|---|---|---|
| Year_Recorded | ||||
| 2020 | 174742.141054 | 202145.706569 | 145588.813376 | 201051.667907 |
| 2021 | 212633.963695 | 236574.945322 | 177102.407229 | 241266.520110 |
| 2022 | 235949.709449 | 249202.798058 | 196493.159187 | 266939.151848 |
| 2023 | 234366.815257 | 243612.356229 | 194003.554493 | 265470.577329 |
| 2024 | 227616.099913 | 246151.022535 | 192820.408494 | 268915.913155 |
# Group data by Year and sum the values for each home type
texas_df = merged_df[(merged_df['State'] == 'TX') & (merged_df['Year_Recorded'].between(2019, 2024))]
area_data = texas_df.groupby('Year_Recorded')[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily',
'AnnualValue_Condo', 'AnnualValue_TwoBedRoom',
'AnnualValue_ThreeBedRoom']].sum()
# Plot the stacked area plot
area_data.plot.area(stacked=True, figsize=(12, 6), colormap="coolwarm")
plt.title("Stacked Area Plot: Annual Home Value for Texas Increases (2019-2024)")
plt.xlabel("Year")
plt.ylabel("Total Annual Value ($)")
plt.xticks(rotation=45)
plt.show()
Correlation Plot shows strong correlation between AllHomes , Single Family , Tow Bedroom and ThreeBedroom with Positive values and close to 1.This indicates when one of these annual categories increase the others tend to increase as well. This shows conistent overall trend ; when one segment is doing well others tend to follow.
Condos show weaker correlation with other categories across Nebraska. This suggest Condos are not much preferable.
import seaborn as sns
import matplotlib.pyplot as plt
# Create a new dataframe with just the columns needed for the correlation matrix
comparison_df = nebraska_df[['AnnualValue_AllHomes', 'AnnualValue_SingleFamily',
'AnnualValue_Condo', 'AnnualValue_TwoBedRoom', 'AnnualValue_ThreeBedRoom']]
# Calculate the correlation matrix
corr_matrix = comparison_df.corr()
# Plot heatmap for the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix for Nebraska Annual Value Increases")
plt.show()